1 Figure 5A


1.1 top

cd /data1/qtl/code_for_submit/data/Fig5A

R
rm(list = ls())
library(data.table)
library(ggplot2)
conditional_res <- fread('NSCLC.conditional.chr2.loc_1.GWAS.condition.intron.csv')
conditional_res <- subset(conditional_res,!is.na(pv))
conditional_res$POS <- as.integer(sapply(strsplit(conditional_res$snp,':'),'[',2))
start <- sapply(strsplit('2:61372331:61389977:clu_27445_+:ENSG00000162929.9',':'),'[',2)
end <- sapply(strsplit('2:61372331:61389977:clu_27445_+:ENSG00000162929.9',':'),'[',3)
min.limit <- mean(c(as.numeric(start),as.numeric(end))) - 300000
max.limit <- mean(c(as.numeric(start),as.numeric(end))) + 300000
conditional_res <- subset(conditional_res, (POS > min.limit) & (POS < max.limit) & !is.na(pv))
rawgwas <- conditional_res[,c('POS','pv')]
condgwas <- conditional_res[,c('POS','cond.pv')]
colnames(condgwas) <- c('POS','pv')
rawgwas$note <- 'Main GWAS'
condgwas$note <- paste0('Conditioned on\npredictor effect')
dat1 <- rbind(rawgwas,condgwas)
dat1 <- subset(dat1,!is.na(pv))
len <- max(-log10(dat1$pv))
len <- ifelse(len<5,5,len)
pdf(file = 'Figure 5A-top.pdf',width=6.89, height=2.63)
ggplot(data=dat1, aes(x=POS/1e6, y=-log10(pv))) + 
  geom_point(aes(color = note), pch=16, size=2.25) + theme_classic() +
  scale_color_manual(values = rev(c("gray","#2D417F")),name = '')   +           
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank()) +
  xlab('Chr2 physical position (Mb)') +
  ylab(expression(-log[10]*italic("P"))) +scale_y_continuous(breaks = seq(0,len,len%/%5),labels = seq(0,len,len%/%5), expand=c(0.05, 0)) + scale_x_continuous(expand = c(0, 0)) +
  theme(axis.text.y = element_text(size=15, color = "black"), axis.text.x = element_text(size=15, color = "black"), axis.title.y = element_text(size=15, color = "black"), axis.title.x = element_text(size=15, color = "black")) +
  theme(axis.line = element_line(colour = 'black', size = 0.3))+ geom_point(data = subset(dat1, POS == 61380622)[1,], aes(x=POS/1e6, y=-log10(pv)), color = "#7F00BA", pch=16, size=2.25) + theme(legend.key.size = unit(0.5, 'cm')) + 
  geom_text(data = subset(dat1,POS==61380622)[1,],aes(x = POS/1e6+0.05 , y = -log10(pv), label = 'rs111543747')) +theme(legend.position = c(0.15, 0.8),legend.title = element_blank(),legend.text=element_text(size=12),legend.background = element_rect(fill=F,linetype="solid", size=0.1,colour ="black"))
dev.off()

1.2 bottom

dats <- read.table('chr2_condition.top-QTL.txt',h=T)
dats$pos <- as.numeric(sapply(strsplit(dats$SNP,':'),'[',2))
dat <- subset(dats, (pos > min.limit) & (pos < max.limit) )
rawgwas <- dat[,c('pos','p')]
condgwas <- dat[,c('pos','pC')]
colnames(condgwas) <- colnames(rawgwas)
rawgwas$note <- 'Main eQTL'
condgwas$note <- 'Conditioned on\ntop eQTL'
dat1 <- rbind(rawgwas,condgwas)
dat1 <- subset(dat1,!is.na(p))
len <- max(-log10(na.omit(dat1$p)))
len <- ifelse(len<5,5,len)
pdf(file = 'Figure 5A-bottom',width=6.89, height=2.63)
ggplot(data=dat1, aes(x=pos/1e6, y=-log10(p))) + 
  geom_point(aes(color = note), pch=16, size=2.25) + theme_classic() +
  scale_color_manual(values = rev(c("gray","#2D417F")),name = '') +         
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank()) +
  xlab('Chr2 physical position (Mb)') +
  ylab(expression(-log[10]*italic("P"))) + scale_y_continuous(breaks = seq(0,len,len%/%5),labels = seq(0,len,len%/%5), expand=c(0.05, 0)) + scale_x_continuous(expand = c(0, 0)) +
  theme(axis.text.y = element_text(size=15, color = "black"), axis.text.x = element_text(size=15, color = "black"), axis.title.y = element_text(size=15, color = "black"), axis.title.x = element_text(size=15, color = "black")) +
  theme(axis.line = element_line(colour = 'black', size = 0.3)) + theme(legend.key.size = unit(0.5, 'cm')) + geom_point(data = subset(dat1, pos == 61380622)[1,], aes(x=pos/1e6, y=-log10(p)), color = "#7F00BA", pch=16, size=2.25) + theme(legend.key.size = unit(0.5, 'cm')) + 
  geom_text(data = subset(dat1,pos==61380622)[1,],aes(x = pos/1e6+0.05 , y = -log10(p), label = 'rs111543747')) +theme(legend.position = c(0.15, 0.8),legend.title = element_blank(),legend.text=element_text(size=12),legend.background = element_rect(fill=F,linetype="solid", size=0.1,colour ="black"))
dev.off()   


AI中组合图片并修改细节







2 Figure 5B


2.1 top

cd /data1/qtl/code_for_submit/data/Fig5B
awk -F'\t' '$17 == 2 && ($18 > 61380622 - 200000) && ($18< 61380622 + 200000)' /data1/qtl/3QTL/enrichment/gwas/summary_data/3_China_pop_NSCLC_updated_0916_with_chr_bp.txt >> NSCLC_dat

/Public/jinchen/software/locuszoom/bin/locuszoom \
--metal NSCLC_dat \
--refsnp rs111543747 --markercol rsID --pvalcol P-value --delim tab \
--flank 200kb --plotonly \
--pop ASN --build hg19 --source 1000G_March2012 --svg --prefix NSCLC

2.2 median

/Public/jinchen/software/locuszoom/bin/locuszoom \
--metal sinput --ignore-vcf-filter --ld-vcf /Public/jinchen/software/locuszoom/data/1000G/genotypes/vcf/338Genotype_use_update_rsid.vcf.gz \
--refsnp rs111543747 --markercol name --pvalcol P --delim tab \
--flank 200kb --no-cleanup --plotonly \
--pop ASN --build hg19 --source 1000G_March2012 --prefix sqtl

2.3 bottom

awk '$1 == "ENSG00000162929.9"' ../../data/338sample_eqtl_id_p_chr_bp.txt > einput
/Public/jinchen/software/locuszoom/bin/locuszoom \
--metal einput --ignore-vcf-filter --ld-vcf /Public/jinchen/software/locuszoom/data/1000G/genotypes/vcf/338Genotype_use_update_rsid.vcf.gz \
--refsnp rs111543747 --markercol name --pvalcol P --delim tab \
--flank 200kb --no-cleanup --plotonly \
--pop ASN --build hg19 --source 1000G_March2012 --prefix eqtl


AI中组合图片并修改细节







3 Figure 5C


cd /data1/qtl/code_for_submit/data/Fig5C-D
R
rm(list=ls())
library(data.table)
library(tidyfst) 
library(coloc) 
library(ggrepel)
library(locuscomparer) 
library(ggtext)
snp <- fread('../116_geno005_maf001_hwe06.bim_id_query_rsid')
names(snp)[2] <- 'SNP'
dat <- parse_fst('sQTL_NSCLC.fst')
dat1 <- as.data.frame(dat)
dbsnp<-subset(dat1,dat1$gene_id=='2:61372331:61389977:clu_27445_+:ENSG00000162929')
dbsnp$SNP <- paste0(sapply(strsplit(dbsnp$SNP,':'),'[',1),':',sapply(strsplit(dbsnp$SNP,':'),'[',2),':',sapply(strsplit(dbsnp$SNP,':'),'[',3),'/',sapply(strsplit(dbsnp$SNP,':'),'[',4))
dbsnp <- dplyr::left_join(dbsnp,snp[,c('SNP','name')],by = 'SNP')
dbsnp <- na.omit(dbsnp)
dbsnp <- dbsnp[order(dbsnp$pval_nominal,dbsnp$P),]
dbsnp <- subset(dbsnp,!duplicated(name))
dbsnp$p_merge <- dbsnp$pval_nominal + dbsnp$P
snp_ld <- retrieve_LD('2', dbsnp[which(dbsnp$p_merge==min(dbsnp$p_merge)),]$name , 'EAS')
names(snp_ld)[2] <- 'name'
dbsnp1 <- subset(dbsnp,bp<=(as.integer(dbsnp[which(dbsnp$p_merge==min(dbsnp$p_merge)),]$bp) + 500000) & bp>=(as.integer(dbsnp[which(dbsnp$p_merge==min(dbsnp$p_merge)),]$bp) - 500000))
dbsnp1 <- dplyr::left_join(dbsnp1,snp_ld[,c('name','R2')],by = 'name')
dbsnp1$R2[which(dbsnp1$name==dbsnp[which(dbsnp$p_merge==min(dbsnp$p_merge)),]$name )] <- 1
dbsnp1$R2 <- ifelse(is.na(dbsnp1$R2),0,dbsnp1$R2)
dbsnp1$ld_group <- ifelse(dbsnp1$R2<=0.2,'r02',ifelse(dbsnp1$R2>0.2 & dbsnp1$R2<=0.4,'r04',ifelse(dbsnp1$R2>0.4 & dbsnp1$R2<=0.6,'r06',ifelse(dbsnp1$R2>0.6 & dbsnp1$R2<=0.8,'r08','r1'))))
dbsnp1$ld_group <- factor(dbsnp1$ld_group,levels = c('r02','r04','r06','r08','r1'))
RValue <- cor.test(-log10(dbsnp1$P), -log10(dbsnp1$pval_nominal), exact=F)$estimate
PValue <- cor.test(-log10(dbsnp1$P), -log10(dbsnp1$pval_nominal), exact=F)$p.value
labels <- paste("Pearson correlation", "\nr =", format(RValue, digits = 3),"\nP =", format(PValue, scientific = TRUE, digits = 3))
color_map <- c("#5a668f", "#7ea8be", "#6bc4c6", "#e39c4d", "#a23b72")
p <- ggplot(dbsnp1, aes(x = -log10(P), y = -log10(pval_nominal))) +
  geom_point(data = subset(dbsnp1, name != 'rs111543747'),
             aes(color = ld_group), 
             shape = 19, size = 3) +
  geom_point( data = subset(dbsnp1, name == 'rs111543747'),
              shape = 21, fill = "#D73027", size = 3) +
  scale_color_manual(values = color_map, drop = FALSE) +
  labs(x = "GWAS -log10(P)",
       y = "KIAA1841 eQTL -log10(P)",
       fill = expression(r^2) ) +
  geom_text_repel(data = subset(dbsnp1, name == 'rs111543747'),
                  aes(label = 'rs111543747'), 
                  size = 4, direction = "both", point.padding = 1.5, 
                  max.overlaps = 50, segment.color = NA) +
  annotate("text", x = 0.3, y = Inf, label = labels, 
           hjust = 0, vjust = 1.3, size = 4) +
  theme_classic() + theme(panel.border = element_rect(linewidth = 0.5, colour = "black", fill = NA),
                          axis.text.y = element_text(size = 12, color = "black", margin = margin(r = 6)),
                          axis.text.x = element_text(size = 12, color = "black", margin = margin(t = 6)),
                          axis.ticks.length = unit(-0.15, "cm"),
                          axis.title.y = element_markdown(size = 12, color = "black"),
                          axis.title.x = element_markdown(size = 12, color = "black"),
                          axis.line = element_line(linewidth = 0.5, colour = "black"),
                          legend.position = "right",legend.key.height = unit(0.5, "cm")) +xlim(0, 4) + ylim(0, 26) +
  guides(fill = guide_legend(override.aes = list(shape = 22, size = 8, fill = color_map, color = "black")))
ggsave('Figure 5C.pdf', p, width = 5, height = 3)


AI中修改细节







4 Figure 5D


cd /data1/qtl/code_for_submit/data/Fig5C-D

R
rm(list=ls())
library(data.table)
library(tidyfst) 
library(coloc) 
library(ggrepel)
library(locuscomparer) 
library(ggtext)
snp <- fread('../116_geno005_maf001_hwe06.bim_id_query_rsid')
names(snp)[2] <- 'SNP'
system('zgrep ENSG00000162929 /data1/qtl/test/338_QTL/338_eQTL/338sample_eqtl.txt.gz >qtl_res')
dat <- parse_fst('NSCLC_gwas_geno.fst')
dat1 <- as.data.frame(dat)
qtl_res <- fread('qtl_res',h = F)
names(qtl_res) <- c('gene_id','variant_id','tss_distance','ma_samples','ma_count','maf','pval_nominal','slope','slope_se')
qtl_res$gene_id <- substring(qtl_res$gene_id,1,15)
names(qtl_res)[2] <- 'MarkerName'
dbsnp <- dplyr::left_join(qtl_res,dat1[,c('MarkerName','chr','bp','P')],by = 'MarkerName')
names(dbsnp)[2] <- 'SNP'
dbsnp <- dplyr::left_join(dbsnp,snp[,c('SNP','name')],by = 'SNP')
dbsnp <- na.omit(dbsnp)
dbsnp <- dbsnp[order(dbsnp$pval_nominal,dbsnp$P),]
dbsnp <- subset(dbsnp,!duplicated(name))
dbsnp$p_merge <- dbsnp$pval_nominal + dbsnp$P
snp_ld <- retrieve_LD('1', dbsnp[which(dbsnp$p_merge==min(dbsnp$p_merge)),]$name , 'EAS')
names(snp_ld)[2] <- 'name'
dbsnp1 <- subset(dbsnp,bp<=(as.integer(dbsnp[which(dbsnp$p_merge==min(dbsnp$p_merge)),]$bp) + 500000) & bp>=(as.integer(dbsnp[which(dbsnp$p_merge==min(dbsnp$p_merge)),]$bp) - 500000))
dbsnp1 <- dplyr::left_join(dbsnp1,snp_ld[,c('name','R2')],by = 'name')
dbsnp1$R2[which(dbsnp1$name==dbsnp[which(dbsnp$p_merge==min(dbsnp$p_merge)),]$name )] <- 1
dbsnp1$R2 <- ifelse(is.na(dbsnp1$R2),0,dbsnp1$R2)
dbsnp1$ld_group <- ifelse(dbsnp1$R2<=0.2,'r02',ifelse(dbsnp1$R2>0.2 & dbsnp1$R2<=0.4,'r04',ifelse(dbsnp1$R2>0.4 & dbsnp1$R2<=0.6,'r06',ifelse(dbsnp1$R2>0.6 & dbsnp1$R2<=0.8,'r08','r1'))))
dbsnp1$ld_group <- factor(dbsnp1$ld_group,levels = c('r02','r04','r06','r08','r1'))
RValue <- cor.test(-log10(dbsnp1$P), -log10(dbsnp1$pval_nominal), exact=F)$estimate
PValue <- cor.test(-log10(dbsnp1$P), -log10(dbsnp1$pval_nominal), exact=F)$p.value
labels <- paste("Pearson correlation", "\nr =", format(RValue, digits = 3),"\nP =", format(PValue, scientific = TRUE, digits = 3))
color_map <- c("#5a668f", "#7ea8be", "#6bc4c6", "#e39c4d", "#a23b72")
p <- ggplot(dbsnp1, aes(x = -log10(P), y = -log10(pval_nominal))) +
  geom_point(data = subset(dbsnp1, name != 'rs111543747'),
             aes(color = ld_group), 
             shape = 19, size = 3) +
  geom_point( data = subset(dbsnp1, name == 'rs111543747'),
              shape = 21, fill = "#D73027", size = 3) +
  scale_color_manual(values = color_map, drop = FALSE) +
  labs(x = "GWAS -log10(P)",
       y = "KIAA1841 eQTL -log10(P)",
       fill = expression(r^2) ) +
  geom_text_repel(data = subset(dbsnp1, name == 'rs111543747'),
                  aes(label = 'rs111543747'), 
                  size = 4, direction = "both", point.padding = 1.5, 
                  max.overlaps = 50, segment.color = NA) +
  annotate("text", x = 0.3, y = Inf, label = labels, 
           hjust = 0, vjust = 1.3, size = 4) +
  theme_classic() + theme(panel.border = element_rect(linewidth = 0.5, colour = "black", fill = NA),
                          axis.text.y = element_text(size = 12, color = "black", margin = margin(r = 6)),
                          axis.text.x = element_text(size = 12, color = "black", margin = margin(t = 6)),
                          axis.ticks.length = unit(-0.15, "cm"),
                          axis.title.y = element_markdown(size = 12, color = "black"),
                          axis.title.x = element_markdown(size = 12, color = "black"),
                          axis.line = element_line(linewidth = 0.5, colour = "black"),
                          legend.position = "right",legend.key.height = unit(0.5, "cm")) +xlim(0, 4) + ylim(0, 26) +
  guides(fill = guide_legend(override.aes = list(shape = 22, size = 8, fill = color_map, color = "black")))
ggsave('Figure 5D.pdf', p, width = 5, height = 3)


AI中修改细节







5 Figure 5E


cd /data1/qtl/code_for_submit/data/Fig5E

R
options(stringsAsFactors=F)
library(ggplot2)
library(ggraph)
library(ggnewscale)
library(data.table)
rm(list = ls())
chr_name <- 'chr2'
gen_name <- 'KIAA1841'
snp_name <- 'rs111543747'
snp_chr <- 2
snp_pos <- 61380622
plot_min <- 59379000
clu_name <- 'clu_27445_+'
trans_mm <- read.csv('Mean_Median_Transcript_Expression_338_tpm.csv')
head(trans_mm)
nrow(trans_mm) 
trans_mm <- subset(trans_mm, median_expr > 0.1)
nrow(trans_mm)
trans_gen <- read.csv('KIAA1841_gencode.v19.annotation.csv')
head(trans_gen)
trans_gen$ID <- paste0(trans_gen$gene_id,'_',trans_gen$transcript_id)
colnames(trans_gen)[c(4,5,7)] <- c('start','end','strand')
trans_gen <- trans_gen[order(trans_gen$start,decreasing=T),]
head(trans_gen)
trans_gen <- subset(trans_gen, ID %in% trans_mm$ID) 
length(unique(trans_gen$ID))
transcript_seq <- data.frame(transcript_name=unique(trans_gen$transcript_name), yorder=1:length(unique(trans_gen$transcript_name)))
dat <- merge(trans_gen, transcript_seq, by='transcript_name')
dat$ymin <- dat$yorder - 0.2
dat$ymax <- dat$yorder + 0.2
trans_mm <- merge(trans_mm, dat[!duplicated(dat$ID),c('ID','transcript_name','transcript_id','transcript_type','yorder','ymin','ymax')],by='ID')
nrow(trans_mm) 
tran_pos <- data.frame(yorder=1:length(unique(dat$yorder)),xmin=as.matrix(by(dat$start,dat$yorder,min)),xmax=as.matrix(by(dat$start,dat$yorder,max)))
ano <- merge(trans_mm, tran_pos, by='yorder')
clu <- read.delim('Lung.phenotype_groups.add',h=F)
head(clu)
clu$clu <- sapply(strsplit(clu$V1,':'),'[',4)
clu <- subset(clu, clu==clu_name)
clu$start <- as.numeric(sapply(strsplit(clu$V1,':'),'[',2))
clu$end <- as.numeric(sapply(strsplit(clu$V1,':'),'[',3))
sda <- subset(dat, (start%in%clu$start) | (start%in%clu$end) | (end%in%clu$start) | (end%in%clu$start) ) 
table(paste(sda$start, sda$end,sep='-'))
int_win <- max(sda$end) - min(sda$start)
s_xmin <- min(sda$start) - int_win/20
s_xmax <- max(sda$end) + int_win/20
x_min <- min(ano$xmin)
x_max <- max(ano$xmax)
xrange_whole <- x_max-x_min
p_transcript <- ggplot() + xlim(c(x_min-15000, x_max+50000)) +
  geom_segment(data=ano,
               aes(x = xmin, y = yorder, xend = xmax, yend = yorder),colour = "#CCCCCC",size=1) +
  geom_rect(data=dat,
            aes(xmin = start, xmax = end, ymin = ymin, ymax = ymax, fill = transcript_type),
            alpha = 1) +
  scale_fill_manual(values = c("#3F76B4", "#A6A6A6", "#57B2AB", "#C32B4A", "#5E4FA2","#EB6046"), 
                    name="Type of \nTranscript",
                    breaks=c("protein_coding", "retained_intron", "processed_transcript", "nonsense_mediated_decay"),
                    labels=c("Protein coding", "Retained intron", "Processed transcript", "Nonsense mediated decay")) +
  geom_text(data=subset(ano, yorder > 2),aes(x = xmin-10000, y = yorder, label=gsub(paste0(gen_name,'-'),'',transcript_name)),size=3) +
  geom_text(data=subset(ano, yorder <= 2),aes(x = xmax+10000, y = yorder, label=gsub(paste0(gen_name,'-'),'',transcript_name)),size=3) +
  new_scale_color() +
  new_scale_fill() +
  geom_point(data=ano,
             aes(x=x_max+27000,y=as.numeric(yorder),color = median_expr),shape=16,size=4)  +
  scale_color_gradient(low = "#FFCCCC", high = "#CC0000", name='Median Expression') +
  theme_classic() + 
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        axis.line = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank()) + xlab('') + ylab('')
snp <- read.table(paste0('./EAS_result_',snp_name,'.ld'),h=T)
snp_tag <- data.frame()
for (i in 1:nrow(trans_gen)) {
  snp_tag1 <- subset(snp,BP_B>trans_gen$start[i] & BP_B<trans_gen$end[i])
  snp_tag <- rbind(snp_tag,snp_tag1)}
p_trans_add <- p_transcript + 
  geom_text(aes(x = x_max+22000, y = nrow(ano)+2, label="Median \nExpression"),hjust = 0,size=3) +
  geom_segment(data=snp,
               aes(x = BP_B, y = nrow(ano)+0.6, xend = BP_B, yend = nrow(ano)+1.2),color='#006600',size=0.1) +
  geom_segment(aes(x = x_min-5000, y = nrow(ano)+2, xend = x_max+5000, yend = nrow(ano)+2),
               arrow = arrow(length = unit(0.2, "cm"),type = "closed")) +
  geom_segment(
    aes(x = seq(plot_min,x_max,50000), y = nrow(ano)+2, xend = seq(plot_min,x_max,50000), yend = nrow(ano)+2.2)) +  
  geom_text(aes(x = seq(plot_min,x_max,50000), y = nrow(ano)+2.4), label=paste(round(seq(plot_min,x_max,50000)/1e3,1),'Kb',sep=''),size=3) 
r <- s_xmax 
l <- s_xmin 
r1 <- x_max + 0.01*xrange_whole
l1 <- mean(c(x_min,x_max)) - 0.15*xrange_whole
p_trans_add_2 <- p_trans_add +
  geom_segment(data=data.frame(x=c(l,r), 
                               y=c(0.3,0.3), 
                               xend=c(l,r), 
                               yend=rep(nrow(ano)+0.8,2)),
               aes(x = x, y = y, xend = xend, yend = yend),color='#3C3C3C',size=0.5,linetype='dashed') + 
  geom_segment(data=data.frame(x=c(l,r), 
                               y=c(0.3,0.3), 
                               xend=c(l1,r1), 
                               yend=c(-1,-1)),
               aes(x = x, y = y, xend = xend, yend = yend),color='#3C3C3C',size=0.5,linetype='dashed')
snp$rsid <- snp$SNP_B
snp <- subset(snp, BP_B > l & BP_B < r)
nrow(snp)           
snp <- snp[order(snp$BP_B),]
snp$pos.grp <- cut(snp$BP_B, seq( min(snp$BP_B)-100,max(snp$BP_B)+100,0.2*(max(snp$BP_B) - min(snp$BP_B))), labels=1:5)
snp$pos.grp <- as.numeric(snp$pos.grp)
snp$pos.grp <- ifelse(is.na(snp$pos.grp), max(snp$pos.grp, na.rm=T), snp$pos.grp)
snp_use <- data.frame()
for(i in unique(snp$pos.grp)){
  ssnp <- subset(snp, pos.grp == i)
  ssnp$y <- (-1*(1:nrow(ssnp)))-1
  snp_use <- rbind(snp_use, ssnp)}
snp_use$ymin <- snp_use$y - 0.3
snp_use$ymax <- snp_use$y + 0.3              
sda <- subset(dat, !(end < s_xmin | start > s_xmax))
sano <- subset(ano, transcript_name %in% sda$transcript_name )
sano$xmin <- ifelse(sano$xmin < s_xmin,s_xmin,sano$xmin)
sano$xmax <- ifelse(sano$xmax > s_xmax,s_xmax,sano$xmax)
sano <- sano[order(sano$yorder),]
sano$yorder <- rev(-1*(1:nrow(sano))-2.5) 
sda <- sda[,colnames(sda)!='yorder']
sda <- merge(sda,sano[,c('transcript_name','yorder')],by='transcript_name')
nrow(sda)
sda$ymin <- sda$yorder - 0.3
sda$ymax <- sda$yorder + 0.3
sano$xmin <- r1 - ((r - sano$xmin)*(r1-l1)/(r-l))
sano$xmax <- r1 - ((r - sano$xmax)*(r1-l1)/(r-l))
sda$start <- r1 - ((r - sda$start)*(r1-l1)/(r-l))
sda$end <- r1 - ((r - sda$end)*(r1-l1)/(r-l))
snp_use$bp <-   r1 - ((r - snp_use$BP_B)*(r1-l1)/(r-l))
snp_list <- c('rs111543747','rs1665258')
snp_use <- subset(snp_use,SNP_B%in%snp_list)
snp_use$y <- -2
p_trans_add_3 <- p_trans_add_2 +
  geom_segment(data=sano,aes(x = xmin, y = yorder, xend = xmax, yend = yorder),colour = "#CCCCCC",size=1) +
  new_scale_color() +
  geom_rect(data=sda,aes(xmin = start, xmax = end, ymin = ymin, ymax = ymax, fill = transcript_type),alpha = 1, show.legend = FALSE) +
  scale_fill_manual(values = c("#3F76B4", "#A6A6A6", "#57B2AB", "#C32B4A", "#5E4FA2","#EB6046"), name="Type of \nTranscript",
                    breaks=c("protein_coding", "retained_intron", "processed_transcript", "nonsense_mediated_decay"),
                    labels=c("Protein coding", "Retained intron", "Processed transcript", "Nonsense mediated decay")) + 
  geom_text(data=sano,aes(x = xmin-0.05*(r1 - l1), y = yorder, label=gsub(paste0(gen_name,'-'),'',transcript_name)),size=3) +
  geom_segment(data=snp_use,aes(x = bp, y = y+1.2, xend = bp, yend = -1.1),color='#006600',size=0.5) +
  geom_text(data=subset(snp_use,!rsid%in%snp_list),aes(x = bp-4000, y = y, label=rsid),size=3,color='#006600') +
  geom_text(data=subset(snp_use,rsid%in%snp_list),aes(x = bp+4000, y = y, label=rsid),size=3,color='#006600') +
  theme(plot.margin = unit(c(b=0,t=0,l=0,r=0), "cm"))+
  geom_segment(data=snp_use,
               aes(x = bp, y = -5, xend = bp, yend = -0.8),color='#006600',size=0.5,linetype='dashed')
twas_res1 <- read.csv('NSCLC_spTWAS_Assoc_coloc_cytoband_fdr02_forTable.csv')
twas_res2 <- read.csv('LUAD_TWAS_Assoc_coloc_cytoband_fdr02_forTable.csv')
twas_res3 <- read.csv('LUSC_spTWAS_Assoc_coloc_cytoband_fdr02_forTable.csv')
twas_res <- rbind(twas_res1,twas_res2,twas_res3)
twas_res <- subset(twas_res,TWAS.FDR<0.1)
twas_res$gene_name <- gsub(' ','',twas_res$gene_name,fixed = T)
twas_res <- subset(twas_res,gene_name==gen_name)
twas_res <- twas_res[,c('ID','gene_name','EQTL.ID','BEST.EQTL.SNP','histology','IntronID')]
twas_res$clu_min <- sapply(strsplit(twas_res$IntronID,':'),'[',3)
twas_res$clu_max <- sapply(strsplit(twas_res$IntronID,':'),'[',4)
twas_res$xmin <- r1 - ((r - as.numeric(twas_res$clu_min))*(r1-l1)/(r-l))
twas_res$xmax <- r1 - ((r - as.numeric(twas_res$clu_max))*(r1-l1)/(r-l))
twas_res$yorder_index <- 1:nrow(twas_res)
twas_res$yorder <- min(sano$yorder)-twas_res$yorder_index
twas_res$rect_ymin <- min(sda$ymin)-twas_res$yorder_index
twas_res$rect_ymax <- min(sda$ymax)-twas_res$yorder_index
intron <- data.frame(yorder=twas_res$yorder,xmin=twas_res$xmin,xmax=twas_res$xmax)
intron_site <- data.frame(ymin=rep(twas_res$rect_ymin,nrow(intron)),ymax=rep(twas_res$rect_ymax,nrow(intron)))
intron1 <- data.frame(xmin=intron$xmin,xmax=intron$xmin+min(sda$end-sda$start))
intron2 <- data.frame(xmin=intron$xmax-min(sda$end-sda$start),xmax=intron$xmax)
intron_site_for_merge <- rbind(intron1,intron2)
intron_site <- cbind(intron_site,intron_site_for_merge)
p_trans_add_4 <- p_trans_add_3+
  geom_segment(data=intron,aes(x = xmin, y = yorder, xend = xmax, yend = yorder),colour = "#CCCCCC",size=3) +
  geom_rect(data=intron_site,aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),alpha = 1, show.legend = FALSE) +
  geom_text(data=twas_res,aes(x = xmin-0.2*(r1 - l1), y = yorder, label=IntronID),size=3)           
ggsave(p_trans_add_4,file='Figure 5E', width=12, height=11) 


AI中修改细节







6 Figure 5F


cd /data1/qtl/code_for_submit/data/Fig5F

R
library(ggplot2)
library(dplyr)
library(tidyr)
rm(list = ls())
psd <- read.csv('mean_PSI.csv')
data_long <- psd %>%
  pivot_longer(cols = GG:CC, names_to = "Genotype", values_to = "PSI")
data_long <- data_long %>%
  separate(id, into = c("Chr", "Start", "End"), sep = ":", convert = TRUE)
genotype_counts <- data.frame(
  Genotype = c("GG","CG", "CC"),
  N = c(21,125, 192))
data_long <- left_join(data_long, genotype_counts, by = "Genotype")
data_long$yorder <- factor(data_long$Genotype, levels = c("GG","CG", "CC"), labels = c(1, 2,3))
snp_pos <- 61372298
exonData <- data.frame(
  Start = rep(c(61372298, 61390187), 3),
  End = rep(c(61372331, 61390367), 3),
  yorder = c(1,1,2,2,3,3))
ggplot() +
  theme_classic() +
  geom_rect(data = exonData, aes(xmin = Start, xmax = End, ymin = as.numeric(yorder) - 0.05, ymax = as.numeric(yorder) + 0.05)) +
  # geom_rect(data = data_long, aes(xmin = End - 200, xmax = End, ymin = as.numeric(yorder) - 0.05, ymax = as.numeric(yorder) + 0.05), fill = "black") +
  geom_segment(data = data_long, aes(x = Start-2000, xend = End+1000, y = as.numeric(yorder), yend = as.numeric(yorder)), size = 1) +
  geom_curve(data = data_long, aes(x = Start, y = as.numeric(yorder), xend = End, yend = as.numeric(yorder), color = PSI, size = PSI),
             curvature = 0.4, alpha = 0.8, lineend = "butt") +
  geom_segment(aes(x = snp_pos, xend = snp_pos, y = 0.5, yend = 3.5), color = "#666666", linetype = "dotted", size = 1) +
  geom_text(aes(x = snp_pos + 200, y = 3.7, label = "Associated SNP:\nrs321394"), size = 5, hjust = 0) +
  geom_text(data = data_long, aes(x = (Start + End) / 2, y = as.numeric(yorder) - 0.3, label = paste0(round(PSI * 100, 2), "%")), size = 5) +
  geom_text(data = genotype_counts, aes(x = min(exonData$Start) - 200, y = as.numeric(factor(Genotype, levels = c("GG","CG", "CC"))), label = paste0(Genotype, "\n(n=", N, ")")), size = 6, hjust = 1) +
  scale_color_gradient(low = "#F9D2BD", high = "red", name = "") +
  scale_size(range = c(0.4, 2)) +xlab("") + ylab("") +
  theme(
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.line = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank()) + xlab("") + ylab("") +theme(legend.position = "none")
ggsave("Figure 5F.pdf", width = 6, height = 5.5)


AI中修改细节







7 Figure 5G


cd /data1/qtl/code_for_submit/data/Fig5G-H

R
rm(list = ls())
library(data.table)
library(ggplot2)
require(gridExtra)
library(ggpubr)
exp <- fread('psi_rs1665258_exp.txt',data.table = F)
exp <- exp[,-1]
exp <- t(exp)
exp <- cbind(IID=rownames(exp),exp)
exp <- as.data.frame(exp)
names(exp)[2] <- 'KIAA1841'
query <- fread('../116_geno005_maf001_hwe06.bim_id_query')
cbPalette <- c("#FF9900","#FF9900","#FF9900")
gen <- read.table('338_rs1665258.raw',h=T,check.names = F)
gen <- subset(gen,!is.na(gen[,ncol(gen)]))
names(gen)[7] <- paste0(query$V7[which(query$V2==sapply(strsplit(names(gen)[7],'_'),'[',1))],'_',sapply(strsplit(names(gen)[7],'_'),'[',2))
allele_last <- sapply(strsplit(names(gen)[7],'_'),'[',1)
allele <- c(sapply(strsplit(allele_last,':'),'[',3),sapply(strsplit(allele_last,':'),'[',4))
alt_allele <- paste0(sapply(strsplit(names(gen)[7],'_'),'[',2),sapply(strsplit(names(gen)[7],'_'),'[',2))
ref_allele <- paste0(setdiff(allele,sapply(strsplit(names(gen)[7],'_'),'[',2)),setdiff(allele,sapply(strsplit(names(gen)[7],'_'),'[',2)))
mix_allele <- paste0(substring(ref_allele,1,1),substring(alt_allele,1,1))
gen$rs_n <- ifelse(gen[,7]==0,paste0(ref_allele,'\n(n=',sum(gen[,7]==0),')'), 
                   ifelse(gen[,7]==2, paste0(alt_allele,'\n(n=',sum(gen[,7]==2),')'),
                          paste0(mix_allele,'\n(n=',sum(gen[,7]==1),')')))
gen$rs_n <- factor(gen$rs_n, levels = c(paste0(ref_allele,'\n(n=',sum(gen[,7]==0),')'), 
                                        paste0(mix_allele,'\n(n=',sum(gen[,7]==1),')'), 
                                        paste0(alt_allele,'\n(n=',sum(gen[,7]==2),')')))
gen <- dplyr::left_join(gen,exp,by = 'IID')
gen[,9] <- as.numeric(gen[,9])
p0 <- ggplot(aes(x=gen$rs_n,y=gen[,ncol(gen)],fill = rs_n),data=gen) + 
  geom_jitter(alpha=0.2,position=position_jitterdodge(jitter.width = 0.35, 
                                                      jitter.height = 0, 
                                                      dodge.width = 0.25))+
  geom_boxplot(alpha=0.4,width=0.2,
               position=position_dodge(width=0.2),
               size=0.75,outlier.colour = NA)+
  geom_violin(alpha=0.2,width=0.7,
              position=position_dodge(width=0.2),
              size=0.75)+
  scale_fill_manual(values = cbPalette)+
  theme_classic() + 
  theme(legend.position="none") + 
  theme(text = element_text(size=16)) + 
  stat_compare_means(aes(group=gen$rs_n),label = "p.format",size=6)+
  ylab('Normalized intron usage') + xlab('rs1665258 genotype') + 
  theme(axis.text.y = element_text(size=16, color = "black"), 
        axis.text.x = element_text(size=16, color = "black"), 
        axis.title.y = element_text(size=16, color = "black"), 
        axis.title.x = element_text(size=16, color = "black"),
        plot.title = element_text(size=16, color = "black",hjust = 0.5,margin = margin(t = 0, r = 0, b = 1.5, l = 0,"cm"))) +
  theme(axis.line = element_line(colour = 'black', size = 0.5)) +
  theme(plot.margin = margin(t=0.2, r=0, b=0.2, l=0, "cm"))+
  stat_summary(fun.y = "median", geom = "point", size = 1) + 
  stat_summary(fun.y = "median", geom = "line", size = 0.8, aes(group = 1),color="#7E6148B2")
ggsave('Figure 5H',p0, width=5.2 ,height= 3.7)

# 查看QTL的P值和beta值,在对应的图片上使用AI进行修改
zgrep 2:61385100:A/G /data1/qtl/test/338_QTL/338_sQTL/338sample_sqtl.txt.gz | grep ENSG00000162929.9
# 2:61372331:61389977:clu_27445_+:ENSG00000162929.9 2:61385100:A/G  92094   141 160 0.236686    1.53909e-16 0.644987    0.0734069







8 Figure 5H


cd /data1/qtl/code_for_submit/data/Fig5G-H

R
rm(list = ls())
library(data.table)
library(ggplot2)
require(gridExtra)
library(ggpubr)
exp <- fread('KIAA1841_exp.txt',data.table = F)
exp <- exp[,-1]
exp <- t(exp)
exp <- cbind(IID=rownames(exp),exp)
exp <- as.data.frame(exp)
names(exp)[2] <- 'KIAA1841'
query <- fread('../116_geno005_maf001_hwe06.bim_id_query')
head(query)
cbPalette <- c("#0072B5","#0072B5","#0072B5")
gen <- read.table('338_KIAA1841.raw',h=T,check.names = F)
gen <- subset(gen,!is.na(gen[,ncol(gen)]))
names(gen)[7] <- paste0(query$V7[which(query$V2==sapply(strsplit(names(gen)[7],'_'),'[',1))],'_',sapply(strsplit(names(gen)[7],'_'),'[',2))
allele_last <- sapply(strsplit(names(gen)[7],'_'),'[',1)
allele <- c(sapply(strsplit(allele_last,':'),'[',3),sapply(strsplit(allele_last,':'),'[',4))
alt_allele <- paste0(sapply(strsplit(names(gen)[7],'_'),'[',2),sapply(strsplit(names(gen)[7],'_'),'[',2))
ref_allele <- paste0(setdiff(allele,sapply(strsplit(names(gen)[7],'_'),'[',2)),setdiff(allele,sapply(strsplit(names(gen)[7],'_'),'[',2)))
mix_allele <- paste0(substring(ref_allele,1,1),substring(alt_allele,1,1))

gen$rs_n <- ifelse(gen[,7]==0,paste0(ref_allele,'\n(n=',sum(gen[,7]==0),')'), 
                   ifelse(gen[,7]==2, paste0(alt_allele,'\n(n=',sum(gen[,7]==2),')'),
                          paste0(mix_allele,'\n(n=',sum(gen[,7]==1),')')))
gen$rs_n <- factor(gen$rs_n, levels = c(paste0(ref_allele,'\n(n=',sum(gen[,7]==0),')'), 
                                        paste0(mix_allele,'\n(n=',sum(gen[,7]==1),')'), 
                                        paste0(alt_allele,'\n(n=',sum(gen[,7]==2),')')))
gen <- dplyr::left_join(gen,exp,by = 'IID')
gen[,9] <- as.numeric(gen[,9])
p0 <- ggplot(aes(x=gen$rs_n,y=gen[,ncol(gen)],fill = rs_n),data=gen) + 
  geom_jitter(alpha=0.2,position=position_jitterdodge(jitter.width = 0.35, 
                                                      jitter.height = 0, 
                                                      dodge.width = 0.25))+
  geom_boxplot(alpha=0.4,width=0.2,
               position=position_dodge(width=0.2),
               size=0.75,outlier.colour = NA)+
  geom_violin(alpha=0.2,width=0.7,
              position=position_dodge(width=0.2),
              size=0.75)+
  scale_fill_manual(values = cbPalette)+
  theme_classic() + 
  theme(legend.position="none") + 
  theme(text = element_text(size=16)) + 
  stat_compare_means(aes(group=gen$rs_n),label = "p.format",size=6)+
  ylab('Normalized gene expression') + xlab('rs1665258 genotype')+ 
  theme(axis.text.y = element_text(size=16, color = "black"), 
        axis.text.x = element_text(size=16, color = "black"), 
        axis.title.y = element_text(size=16, color = "black"), 
        axis.title.x = element_text(size=16, color = "black"),
        plot.title = element_text(size=16, color = "black",hjust = 0.5,margin = margin(t = 0, r = 0, b = 1.5, l = 0,"cm"))) +
  theme(axis.line = element_line(colour = 'black', size = 0.5)) +
  theme(plot.margin = margin(t=0.2, r=0, b=0.2, l=0, "cm"))+
  stat_summary(fun.y = "median", geom = "point", size = 1) + 
  stat_summary(fun.y = "median", geom = "line", size = 0.8, aes(group = 1),color="#7E6148B2")+labs(title = 'KIAA1841')
ggsave('Figure 5H',p0, width=5.8 ,height= 5.7)

# 查看QTL的P值和beta值,在对应的图片上使用AI进行修改
zgrep 2:61385100:A/G /data1/qtl/test/338_QTL/338_eQTL/338sample_eqtl.txt.gz | grep ENSG00000162929.9
# ENSG00000162929.9 2:61385100:A/G  92094   141 160 0.236686    1.50021e-08 0.381469    0.0654129





Working directory: Server 39